# Load required libraries ----
# Data wrangling
library(lubridate)
library(dplyr)
# For plotting
library(ggplot2)
# For regression tree
# library(MASS)
library(tree)
# For bagging and random forest models
library(randomForest)
### For the boosting model
library(gbm)
### For tuning models
library(caret)
# Load and clean up data ----
# Load the raw heat wave data ('all.hws') and tidy it for modeling.
load("data-raw/ListOfAllHeatwaves.Rdata")

# The raw data carries two columns named "mean.temp"; rename the second so
# every column name is unique. Guard the index: if the duplicate is absent
# (e.g. on a re-run against already-fixed data), `which(...)[2]` is NA and
# the assignment would fail.
dup_cols <- which(colnames(all.hws) == "mean.temp")
if (length(dup_cols) >= 2) {
  colnames(all.hws)[dup_cols[2]] <- "mean.temp.1"
}

# Add the heat wave's starting year and make column names syntactic
# (literal spaces -> dots; fixed = TRUE since the pattern is not a regex).
hw <- all.hws %>%
  mutate(start.year = year(start.date))
colnames(hw) <- gsub(" ", ".", colnames(hw), fixed = TRUE)
# Create dataframes for fitting models for y1 and y2 ----
# Drop identifier and standard-error columns that must not be used as
# model predictors.
hw <- hw %>%
  select(-c(hw.number, start.date, end.date, id, Std..Error,
            city, Posterior.Estimate, Posterior.SE, post.se))

# Modeling frames: each keeps exactly one outcome column.
to_fit_y1 <- select(hw, -post.estimate)  # outcome: Estimate (Y1)
to_fit_y2 <- select(hw, -Estimate)       # outcome: post.estimate (Y2)
# Create a random 50/50 train/test split of the heat waves.
# Keep the statement order fixed: the recorded indices below depend on the
# RNG state set here.
set.seed(1001) ## fix the RNG state so the split is reproducible
hw_indices <- 1:nrow(hw)
# Sample half of the row indices (sorted for readability) as the training set.
train <- sort(sample(hw_indices, length(hw_indices) / 2))
head(train)
## [1] 2 3 5 6 9 11
# The remaining rows form the hold-out test set.
test <- hw_indices[-train]
head(test)
## [1] 1 4 7 8 10 12
# Fit a regression tree for Y_1:
# Fit a single regression tree for Y1 (Estimate) on the training rows only.
tree.hw <- tree(Estimate ~ ., data = to_fit_y1, subset = train)
summary(tree.hw)
##
## Regression tree:
## tree(formula = Estimate ~ ., data = to_fit_y1, subset = train)
## Variables actually used in tree construction:
## [1] "start.doy" "min.temp.quantile" "Ppoverty"
## Number of terminal nodes: 4
## Residual mean deviance: 0.03799 = 56.46 / 1486
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.37800 -0.09941 0.01519 0.00000 0.11770 0.85470
# The variables included in this model are start.doy, min.temp.quantile, and Ppoverty. Here is a plot of the final tree for this model:
# Draw the fitted tree and label its splits.
plot(tree.hw)
text(tree.hw, pretty = 0)
# Here are the results of using cross-validation to decide whether (and how much) to prune the tree:
# Cross-validate the fitted tree to see whether pruning helps, then plot
# deviance against tree size.
cv.hw <- cv.tree(tree.hw)
plot(cv.hw$size, cv.hw$dev, type = "b")
# Based on this cross-validation, the best tree size would be 4 terminal nodes.
# Using the unpruned tree, here are predictions on the hold-out (test) dataset:
# Predict Y1 on the hold-out rows with the unpruned tree, then plot
# predicted vs. observed values with a 45-degree reference line.
yhat <- predict(tree.hw, newdata = to_fit_y1[test, ])
to_plot <- data.frame(estimated_y1 = yhat,
                      true_y1 = to_fit_y1[test, "Estimate"])
axis_range <- range(to_plot)
ggplot(to_plot, aes(x = estimated_y1, y = true_y1)) +
  geom_point(alpha = 0.2) +
  xlim(axis_range) +
  ylim(axis_range) +
  geom_abline(intercept = 0, slope = 1) +
  theme_minimal() +
  xlab("Estimated Y_1") +
  ylab("True Y_1")
# The mean squared error for this model was 1.3320971 x 10^-5.
# Fit a bagging model:
# Bagging: a random forest with mtry equal to the number of predictors
# considers every predictor at every split, which is exactly bagged trees.
set.seed(1)
bag.hw <- randomForest(Estimate ~ .,
data = to_fit_y1, subset = train,
mtry = ncol(to_fit_y1) - 1, importance = TRUE)
# Predict on the hold-out rows and plot predicted vs. observed Y1.
yhat.bag <- predict(bag.hw, newdata = to_fit_y1[test, ])
to_plot <- data.frame(estimated_y1 = yhat.bag,
true_y1 = to_fit_y1[test, "Estimate"])
ggplot(to_plot, aes(x = estimated_y1, y = true_y1)) +
geom_point(alpha = 0.2) +
xlim(range(to_plot)) + ylim(range(to_plot)) +
theme_minimal() +
geom_abline(intercept = 0, slope = 1) +
xlab("Estimated Y_1") + ylab("True Y_1")
# The mean squared error for this model was 4.5332584 x 10^-5.
# Here is more on the variable importance for this model:
# Permutation importance (%IncMSE) and node-purity importance for the
# bagged model.
importance(bag.hw)
## %IncMSE IncNodePurity
## mean.temp 7.127092 2.43573258
## max.temp 12.435896 1.93576641
## min.temp 10.426944 1.48729252
## length 6.082851 0.98018861
## start.doy 8.867151 8.44876104
## start.month 5.545108 0.53208653
## days.above.80 6.241774 0.64997029
## days.above.85 6.531809 0.29048742
## days.above.90 5.105985 0.21477811
## days.above.95 2.112760 0.03599527
## days.above.99th 2.167899 0.52560419
## days.above.99.5th 2.985387 0.29941594
## first.in.season 2.306548 0.72265838
## mean.temp.quantile 2.742567 3.99990699
## max.temp.quantile 2.970850 4.50393698
## min.temp.quantile 5.930149 4.25589726
## pop100 8.545890 3.04306541
## Ppoverty 8.651908 3.97522202
## Ppoverty75p 6.833760 2.09244802
## Purban 9.665649 1.51728436
## P75p 10.044029 1.88331991
## pop.density 9.685492 1.61661642
## mean.temp.1 7.041676 1.70637697
## mean.summer.temp 10.088439 1.43119714
## start.year 10.234676 7.86766924
varImpPlot(bag.hw)
# Fit a random forest model, with tuning:
# Use 10-fold cross validation for tuning to find the best `mtry`
fitControl <- trainControl(method = "cv", number = 10)
set.seed(1)
# NOTE(review): ntree = 10 is very small for a random forest; presumably
# chosen to keep tuning fast -- confirm before trusting the selected mtry.
# tuneLength = 5 evaluates five candidate mtry values.
tuning.rf.hw <- train(Estimate ~ ., data = to_fit_y1, subset = train,
method = "rf", trControl = fitControl, ntree = 10,
importance = TRUE, metric="RMSE",
maximize = FALSE, tuneLength=5)
# Here are the results from that tuning process:
# Print the tuning results across the candidate mtry values.
# NOTE(review): the header below says 2980 samples, but the CV fold sizes
# (~1341) correspond to the 1490 training rows, so the 'subset' does appear
# to have been applied -- verify the header is just misleading.
tuning.rf.hw
## Random Forest
##
## 2980 samples
## 25 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.2041435 0.02051843
## 7 0.2081761 0.02619107
## 13 0.2025300 0.05091675
## 19 0.2022381 0.05765925
## 25 0.2036988 0.04836574
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
plot(tuning.rf.hw)
# Based on this tuning, the best value for mtry in the random forest model for this outcome is 19.
# Predict Y1 on the hold-out rows. Predict with the caret 'train' object
# rather than its $finalModel: predict.train reapplies the preprocessing
# and formula/dummy-variable handling caret used when fitting, whereas
# calling the underlying randomForest object directly skips those steps.
yhat.rf <- predict(tuning.rf.hw, newdata = to_fit_y1[test, ])
to_plot <- data.frame(estimated_y1 = yhat.rf,
                      true_y1 = to_fit_y1[test, "Estimate"])
ggplot(to_plot, aes(x = estimated_y1, y = true_y1)) +
  geom_point(alpha = 0.2) +
  xlim(range(to_plot)) + ylim(range(to_plot)) +
  theme_minimal() +
  geom_abline(intercept = 0, slope = 1) +
  xlab("Estimated Y_1") + ylab("True Y_1")
# The mean squared error for this model was 1.0931865 x 10^-5.
# Here are the variable importance plots:
# Variable importance from the tuned random forest's final model
# (accessing $finalModel is appropriate here, since importance() needs the
# underlying randomForest object).
importance(tuning.rf.hw$finalModel)
## %IncMSE IncNodePurity
## mean.temp 2.09145601 2.201291869
## max.temp 1.59144388 1.957183048
## min.temp 3.17867200 1.495273519
## length 1.38093988 1.646819207
## start.doy 1.50580173 6.749238062
## start.month 1.36149982 0.800102981
## days.above.80 1.99678188 0.934009110
## days.above.85 1.31846692 0.281415382
## days.above.90 1.68037566 0.284349229
## days.above.95 -1.05409255 0.000438401
## days.above.99th 1.46687174 0.617808370
## days.above.99.5th 2.03173084 0.443275791
## first.in.season -1.06282656 0.697164009
## mean.temp.quantile 1.99793943 4.278970715
## max.temp.quantile 2.38871728 4.321199142
## min.temp.quantile 3.10884927 3.762528078
## pop100 1.24118549 3.856162140
## Ppoverty -1.09203395 4.870776390
## Ppoverty75p 0.36232123 1.527727356
## Purban -0.85326202 1.498955910
## P75p 2.33854251 1.996748391
## pop.density -0.05911208 2.259959535
## mean.temp.1 3.07875722 1.907943862
## mean.summer.temp 1.75185187 2.953429050
## start.year -0.64510490 4.932279441
varImpPlot(tuning.rf.hw$finalModel)
# Fit a boosting model:
#### Tune the boosting model (interaction depth and number of trees)
gbmgrid <- expand.grid(.interaction.depth = seq(1, 7, by = 2),
                       .n.trees = seq(200, 300, by = 50),
                       .shrinkage = 0.01,
                       .n.minobsinnode = 10)
set.seed(1)
# shrinkage = 0.1 was also tried and performed worse than 0.01, so it was
# dropped from the grid to speed up tuning.
# FIX: tune on 'to_fit_y1', not 'hw'. 'hw' still contains the second
# outcome, post.estimate, and including it as a predictor leaks outcome
# information into the model (the recorded tuning output below shows 26
# predictors, and the fitted model's summary gave post.estimate ~88%
# relative influence). The recorded output below is from the leaky fit.
tuning.boost.hw <- train(Estimate ~ ., data = to_fit_y1[train, ],
                         method = "gbm",
                         tuneGrid = gbmgrid,
                         trControl = fitControl,
                         verbose = FALSE)
# The messages below were emitted when caret loaded gbm's plyr dependency.
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:lubridate':
##
## here
# Print the boosting tuning results.
# NOTE(review): the header reports 26 predictors -- the model was tuned on
# 'hw', which still contains the second outcome (post.estimate) among the
# predictors, leaking outcome information into the model.
tuning.boost.hw
## Stochastic Gradient Boosting
##
## 1490 samples
## 26 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared
## 1 200 0.12337304 0.6630695
## 1 250 0.12028095 0.6677372
## 1 300 0.11854686 0.6699416
## 3 200 0.10794449 0.7426040
## 3 250 0.10367313 0.7528640
## 3 300 0.10124069 0.7588751
## 5 200 0.09906492 0.7865518
## 5 250 0.09424685 0.7967211
## 5 300 0.09121802 0.8043525
## 7 200 0.09346555 0.8114404
## 7 250 0.08827967 0.8213362
## 7 300 0.08519342 0.8276194
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 300,
## interaction.depth = 7, shrinkage = 0.01 and n.minobsinnode = 10.
plot(tuning.boost.hw)
## The tuning plot has its optimum at interaction.depth = 5, n.trees = 200.
set.seed(1)
# Final boosting fit for Y1. Two fixes relative to the original call:
# (1) fit on 'to_fit_y1' rather than 'hw' -- 'hw' still contains the other
#     outcome (post.estimate), and including it leaks outcome information
#     (the recorded summary below shows it with ~88% relative influence);
# (2) pass shrinkage = 0.01 explicitly so the fit matches the tuned value
#     instead of gbm's default shrinkage.
boost.hw <- gbm(Estimate ~ ., data = to_fit_y1[train, ],
                distribution = "gaussian",
                n.trees = 200, interaction.depth = 5,
                shrinkage = 0.01)
summary(boost.hw)
# NOTE(review): post.estimate below is the *other* outcome (Y2); it carries
# ~88% of the relative influence because the model was fit on 'hw', which
# still contains it. That is outcome leakage, not a real predictor effect.
## var rel.inf
## post.estimate post.estimate 88.52329512
## pop100 pop100 5.94188732
## Ppoverty Ppoverty 1.63204641
## length length 1.10142805
## max.temp.quantile max.temp.quantile 0.92279435
## start.doy start.doy 0.42099320
## Ppoverty75p Ppoverty75p 0.37233281
## days.above.80 days.above.80 0.33361266
## start.year start.year 0.19736944
## mean.temp.1 mean.temp.1 0.15951344
## Purban Purban 0.10269778
## min.temp.quantile min.temp.quantile 0.09067863
## mean.temp.quantile mean.temp.quantile 0.07248359
## mean.summer.temp mean.summer.temp 0.05772668
## mean.temp mean.temp 0.03613501
## pop.density pop.density 0.03500551
## max.temp max.temp 0.00000000
## min.temp min.temp 0.00000000
## start.month start.month 0.00000000
## days.above.85 days.above.85 0.00000000
## days.above.90 days.above.90 0.00000000
## days.above.95 days.above.95 0.00000000
## days.above.99th days.above.99th 0.00000000
## days.above.99.5th days.above.99.5th 0.00000000
## first.in.season first.in.season 0.00000000
## P75p P75p 0.00000000
# Partial dependence plots for the three most influential (non-leaked)
# predictors.
par(mfrow =c(1,3))
plot( boost.hw ,i="pop100")
plot( boost.hw,i="Ppoverty")
plot(boost.hw,i="max.temp.quantile")
# Hold-out predictions and test MSE for the boosting model.
yhat.boost = predict(boost.hw, newdata = hw[-train ,],n.trees =200)
hw.test <- hw[-train, "Estimate"]
mean((yhat.boost - hw.test)^2)
## [1] 0.02738211
## NOTE(review): an earlier note recorded MSE = 0.03561743, which does not
## match the printed value 0.02738211 above -- likely from a previous run.
#### Refit everything with post.estimate (Y2) as the response
load("data-raw/ListOfAllHeatwaves.Rdata")
hw <- data.frame(all.hws)
# Rename the second duplicated "mean.temp" column (column 28). The original
# line used '<=' (a comparison, which just printed FALSE) instead of '<-',
# so the rename silently never happened. Note data.frame()'s check.names
# may already have uniquified the name; the stopifnot guards the position
# assumption either way.
stopifnot(colnames(hw)[28] %in% c("mean.temp", "mean.temp.1"))
colnames(hw)[28] <- "mean.temp.1"
# Replace the start date with just its year.
hw$start.date <- year(all.hws$start.date)
# Drop identifier, outcome-SE, and other non-predictor columns by position.
hw <- hw[, -c(1, 7, 20:22, 31:33, 35)]
### Single Tree
# Split the Y2 data 50/50 into training and test rows.
set.seed(1) ## fix the RNG so the split (and all results below) reproduce
train = sample (1: nrow (hw), nrow(hw)/2)
# Fit a single regression tree for post.estimate on the training rows.
tree.hw = tree(post.estimate~.,hw, subset = train )
summary (tree.hw)
##
## Regression tree:
## tree(formula = post.estimate ~ ., data = hw, subset = train)
## Variables actually used in tree construction:
## [1] "mean.temp.quantile" "pop.density" "mean.temp"
## [4] "days.above.95" "max.temp" "Purban"
## [7] "start.doy"
## Number of terminal nodes: 8
## Residual mean deviance: 0.0007425 = 1.1 / 1482
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1008000 -0.0175300 -0.0008852 0.0000000 0.0159900 0.1242000
# Draw the tree, then cross-validate to gauge the best amount of pruning.
plot(tree.hw)
text(tree.hw,pretty=0)
cv.hw =cv.tree(tree.hw)
plot(cv.hw$size,cv.hw$dev,type='b')
## The cross-validation plot suggests the best size would be 4 nodes.
## Prune to that size for comparison:
prune.hw =prune.tree(tree.hw,best=4)
plot(prune.hw)
text(prune.hw,pretty =0)
## The pruned tree is shown for reference; the predictions below use the
## unpruned tree.
yhat=predict(tree.hw,newdata=hw[-train,])
# Hold-out observed values of the Y2 outcome (reused by later model blocks).
hw.test=hw[-train ,'post.estimate']
plot(yhat,hw.test)
abline(0 ,1)
mean((yhat - hw.test)^2)
## [1] 0.0009294525
## MSE=0.0009294525 Really lower!
#### Bagging
# (randomForest is already loaded at the top of the file; this re-load is
# harmless.)
library(randomForest)
set.seed(1)
# mtry = 25 uses every one of the 25 predictors at each split, i.e. bagging.
bag.hw=randomForest(post.estimate~.,data=hw,subset=train,mtry=25,importance=TRUE)
yhat.bag=predict(bag.hw,newdata=hw[-train ,])
# Predicted vs. observed on the hold-out rows, with a 45-degree line.
plot(yhat.bag,hw.test)
abline(0,1)
mean((yhat.bag - hw.test)^2)
## [1] 0.0009133906
## MSE=0.0009133906 doesn't change a lot
# Permutation (%IncMSE) and node-purity importance for the bagged Y2 model.
importance(bag.hw)
## %IncMSE IncNodePurity
## mean.temp 12.681832 0.06342193
## max.temp 10.172925 0.04724131
## min.temp 9.987601 0.03824045
## length 10.332287 0.04498540
## start.date 7.115077 0.09209639
## start.doy 7.543092 0.12534681
## start.month 3.338849 0.01133792
## days.above.80 10.430266 0.03254294
## days.above.85 5.168632 0.01499552
## days.above.90 1.654415 0.01457275
## days.above.95 1.419662 0.01664198
## days.above.99th 8.003545 0.02054583
## days.above.99.5th 5.606331 0.01505929
## first.in.season 4.850694 0.01800880
## mean.temp.quantile 20.771740 0.11276662
## max.temp.quantile 17.800269 0.09103319
## min.temp.quantile 8.327124 0.07477411
## pop100 7.404953 0.06842809
## Ppoverty 8.288051 0.03514406
## Ppoverty75p 6.296062 0.03107052
## Purban 10.532977 0.03156202
## P75p 4.419545 0.04774665
## pop.density 13.058998 0.04261892
## mean.temp.1 10.582677 0.03965594
## mean.summer.temp 13.930181 0.03465096
varImpPlot(bag.hw)
#### random forest
#### Tuning to find best mtry for Random Forest
set.seed(1)
# 'fitControl' (10-fold CV) is defined earlier in the file.
# NOTE(review): ntree = 10 is very small for tuning a random forest;
# presumably chosen for speed -- confirm before trusting the chosen mtry.
tuning.rf.hw=train(post.estimate~.,
data=hw,subset=train,
method = "rf",
trControl = fitControl,
ntree = 10,
importance = TRUE,
metric="RMSE",
maximize = FALSE,
tuneLength=5)
# Print the tuning results; RMSE selects mtry = 25.
# NOTE(review): as above, the "2980 samples" header conflicts with the
# ~1341-row CV folds (which match the 1490 training rows) -- verify.
tuning.rf.hw
## Random Forest
##
## 2980 samples
## 25 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.02952687 0.03094266
## 7 0.02955921 0.03628967
## 13 0.02975596 0.03614151
## 19 0.02994947 0.03070839
## 25 0.02946487 0.04046570
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 25.
plot(tuning.rf.hw)
#### the optimal mtry is 25.
set.seed(1)
# Refit at the tuned mtry. NOTE(review): mtry = 25 equals the number of
# predictors, so this "random forest" is exactly the bagging model fit
# above (same seed, same settings) -- the hold-out MSE below is identical.
rf.hw=randomForest(post.estimate~.,data=hw,subset=train,mtry=25,importance=TRUE)
yhat.bag=predict(rf.hw,newdata=hw[-train ,])
# Predicted vs. observed on the hold-out rows, with a 45-degree line.
plot(yhat.bag,hw.test)
abline(0,1)
mean((yhat.bag - hw.test)^2)
## [1] 0.0009133906
## MSE=0.0009133906 doesn't change a lot
# Variable importance for the tuned random forest (identical to the bagged
# model, since mtry = 25 uses all predictors and the seed is the same).
importance(rf.hw)
## %IncMSE IncNodePurity
## mean.temp 12.681832 0.06342193
## max.temp 10.172925 0.04724131
## min.temp 9.987601 0.03824045
## length 10.332287 0.04498540
## start.date 7.115077 0.09209639
## start.doy 7.543092 0.12534681
## start.month 3.338849 0.01133792
## days.above.80 10.430266 0.03254294
## days.above.85 5.168632 0.01499552
## days.above.90 1.654415 0.01457275
## days.above.95 1.419662 0.01664198
## days.above.99th 8.003545 0.02054583
## days.above.99.5th 5.606331 0.01505929
## first.in.season 4.850694 0.01800880
## mean.temp.quantile 20.771740 0.11276662
## max.temp.quantile 17.800269 0.09103319
## min.temp.quantile 8.327124 0.07477411
## pop100 7.404953 0.06842809
## Ppoverty 8.288051 0.03514406
## Ppoverty75p 6.296062 0.03107052
## Purban 10.532977 0.03156202
## P75p 4.419545 0.04774665
## pop.density 13.058998 0.04261892
## mean.temp.1 10.582677 0.03965594
## mean.summer.temp 13.930181 0.03465096
varImpPlot(rf.hw)
### Boosting
#### Tune interaction depth and number of trees for the boosting model
gbmgrid <- expand.grid(.interaction.depth = seq(3, 7, by = 2),
                       .n.trees = c(200, 250, 300),
                       .shrinkage = 0.01,
                       .n.minobsinnode = 10)
### shrinkage = 0.1 performed much worse; after testing several
### combinations the optimum is depth = 7 with n.trees = 300.
set.seed(1)
tuning.boost.hw <- train(post.estimate ~ ., data = hw[train, ],
                         method = "gbm",
                         tuneGrid = gbmgrid,
                         trControl = fitControl,
                         verbose = FALSE)
# Print the boosting tuning results; RMSE selects depth 7 with 300 trees.
tuning.boost.hw
## Stochastic Gradient Boosting
##
## 1490 samples
## 25 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1342, 1341, 1339, 1339, 1342, 1342, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared
## 3 200 0.02800774 0.06218608
## 3 250 0.02796891 0.06305961
## 3 300 0.02793282 0.06466670
## 5 200 0.02787142 0.06951926
## 5 250 0.02782948 0.07136949
## 5 300 0.02779028 0.07275341
## 7 200 0.02774866 0.07740408
## 7 250 0.02771617 0.07814106
## 7 300 0.02770656 0.07820711
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 300,
## interaction.depth = 7, shrinkage = 0.01 and n.minobsinnode = 10.
plot(tuning.boost.hw)
set.seed(1)
# Final boosting model for post.estimate at the tuned settings.
# FIX: the original call omitted 'shrinkage', so gbm's default was used
# instead of the tuned value; pass shrinkage = 0.01 so the final fit
# matches the tuning above. (The recorded summary below is from the
# original fit without it.)
boost.hw <- gbm(post.estimate ~ ., data = hw[train, ],
                distribution = "gaussian",
                n.trees = 300, interaction.depth = 7,
                shrinkage = 0.01)
summary(boost.hw)
## var rel.inf
## mean.temp.quantile mean.temp.quantile 15.91038293
## start.doy start.doy 10.41904970
## pop100 pop100 7.08397301
## pop.density pop.density 6.98980330
## mean.summer.temp mean.summer.temp 6.77432345
## start.date start.date 6.04210733
## length length 5.89611867
## mean.temp.1 mean.temp.1 5.70701630
## max.temp max.temp 4.94489007
## max.temp.quantile max.temp.quantile 4.86514295
## mean.temp mean.temp 3.32831254
## min.temp.quantile min.temp.quantile 3.19474366
## P75p P75p 3.04572945
## Purban Purban 3.02447910
## Ppoverty Ppoverty 2.94070082
## days.above.80 days.above.80 2.52964834
## min.temp min.temp 1.75729102
## days.above.85 days.above.85 1.16761793
## days.above.99th days.above.99th 1.11275935
## first.in.season first.in.season 1.08215392
## days.above.90 days.above.90 1.06099544
## Ppoverty75p Ppoverty75p 0.76771776
## days.above.95 days.above.95 0.26854035
## start.month start.month 0.08650262
## days.above.99.5th days.above.99.5th 0.00000000
# Partial dependence for the two most influential predictors.
par(mfrow = c(1, 2))
plot(boost.hw, i = "mean.temp.quantile")
plot(boost.hw, i = "start.doy")
# Hold-out predictions and test MSE for the boosting model.
yhat.boost <- predict(boost.hw, newdata = hw[-train, ], n.trees = 300)
mean((yhat.boost - hw.test)^2)
## [1] 0.0009912703
## MSE = 0.0009912703